# Preview the first six rows of the trip table.
head(taxi_data, n = 6L)
## # A tibble: 6 × 21
## ...1 trip_distance rate_code store_and_fwd_flag payment_type fare_amount
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 3 17.0 1 N 1 49.5
## 2 4 14.4 1 N 1 45.5
## 3 5 11.6 1 N 1 42
## 4 10 5.1 1 N 1 26.5
## 5 12 11.1 1 N 1 45.5
## 6 13 9.54 1 N 1 41
## # ℹ 15 more variables: extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
## # tolls_amount <dbl>, imp_surcharge <dbl>, total_amount <dbl>,
## # pickup_location_id <dbl>, dropoff_location_id <dbl>, year <dbl>,
## # month <dbl>, day <dbl>, day_of_week <dbl>, hour_of_day <dbl>,
## # trip_duration <dbl>, calculated_total_amount <dbl>
# Column-wise summary of every variable (quartiles and mean for numeric
# columns, length/class/mode for character columns). The outer parentheses
# make the assigned value auto-print.
(summary_stats <- summary(object = taxi_data))
## ...1 trip_distance rate_code store_and_fwd_flag
## Min. : 3 Min. : 0.010 Min. : 1.000 Length:1048575
## 1st Qu.: 270199 1st Qu.: 6.470 1st Qu.: 1.000 Class :character
## Median : 542653 Median : 8.700 Median : 1.000 Mode :character
## Mean : 543661 Mean : 9.093 Mean : 1.111
## 3rd Qu.: 817071 3rd Qu.: 10.920 3rd Qu.: 1.000
## Max. :1090739 Max. :7655.760 Max. :99.000
## payment_type fare_amount extra mta_tax
## Min. :1.000 Min. : 0.01 Min. :-0.4500 Min. :0.0000
## 1st Qu.:1.000 1st Qu.: 24.00 1st Qu.: 0.0000 1st Qu.:0.5000
## Median :1.000 Median : 29.00 Median : 0.0000 Median :0.5000
## Mean :1.105 Mean : 31.91 Mean : 0.3208 Mean :0.4864
## 3rd Qu.:1.000 3rd Qu.: 36.00 3rd Qu.: 0.5000 3rd Qu.:0.5000
## Max. :4.000 Max. : 9999.50 Max. :18.5000 Max. :0.5000
## tip_amount tolls_amount imp_surcharge total_amount
## Min. : 0.000 Min. : 0.000 Min. :0.0 Min. : 0.31
## 1st Qu.: 4.560 1st Qu.: 0.000 1st Qu.:0.3 1st Qu.: 30.35
## Median : 6.150 Median : 0.000 Median :0.3 Median : 38.47
## Mean : 6.287 Mean : 2.268 Mean :0.3 Mean : 41.58
## 3rd Qu.: 8.110 3rd Qu.: 5.760 3rd Qu.:0.3 3rd Qu.: 48.36
## Max. :415.000 Max. :910.900 Max. :0.6 Max. :10001.30
## pickup_location_id dropoff_location_id year month
## Min. : 1 Min. : 1.0 Min. :2018 Min. : 1.000
## 1st Qu.:132 1st Qu.: 87.0 1st Qu.:2018 1st Qu.: 3.000
## Median :138 Median :141.0 Median :2018 Median : 6.000
## Mean :153 Mean :146.9 Mean :2018 Mean : 6.294
## 3rd Qu.:186 3rd Qu.:229.0 3rd Qu.:2018 3rd Qu.: 9.000
## Max. :265 Max. :265.0 Max. :2018 Max. :12.000
## day day_of_week hour_of_day trip_duration
## Min. : 1.00 Min. :0.000 Min. : 0.00 Min. : 1
## 1st Qu.: 9.00 1st Qu.:1.000 1st Qu.:10.00 1st Qu.: 1449
## Median :16.00 Median :3.000 Median :14.00 Median : 1853
## Mean :15.78 Mean :2.936 Mean :13.87 Mean : 2212
## 3rd Qu.:23.00 3rd Qu.:4.000 3rd Qu.:19.00 3rd Qu.: 2329
## Max. :31.00 Max. :6.000 Max. :23.00 Max. :320031
## calculated_total_amount
## Min. : 0.31
## 1st Qu.: 30.35
## Median : 38.47
## Mean : 41.50
## 3rd Qu.: 48.30
## Max. :10001.30
# Histogram of trip distances for trips between 0 and 20 miles.
# NOTE(review): `count` is not referenced by any plot in this section; it is
# kept only in case later code reads it -- confirm and remove if unused.
count <- c(250000, 500000, 1000000, 1048575)
ggplot(taxi_data, aes(x = trip_distance)) +
  # boundary = 0 aligns the 1-mile bins to [0, 1), [1, 2), ... so the first
  # and last bins lie inside the x limits instead of straddling them (bins
  # centred on 0 and 20 extend past the limits and are dropped by the scale).
  geom_histogram(
    binwidth = 1,
    boundary = 0,
    fill = "skyblue",
    color = "black"
  ) +
  # Scale limits discard trips longer than 20 miles before binning.
  xlim(0, 20) +
  scale_y_continuous(labels = scales::comma) +
  # labs() has to be added to the plot with `+`; as a stand-alone statement it
  # only prints a labels object and the chart keeps its default axis titles.
  labs(
    title = "Trip Distance Distribution",
    x = "Distance (miles)",
    y = "Count"
  )
## Warning: Removed 22541 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Mean trip distance per pickup location ID, drawn as a red line with points.
pickup_data <- taxi_data %>%
  group_by(pickup_location_id) %>%
  # na.rm = TRUE keeps one missing distance from turning a zone's mean into NA.
  summarize(avg_distance = mean(trip_distance, na.rm = TRUE))
ggplot(pickup_data, aes(x = pickup_location_id, y = avg_distance)) +
  # A constant colour is set outside aes(). Inside aes() the string "red" is
  # treated as a data value, which adds a legend and uses the default palette
  # colour instead of red.
  geom_line(color = "red") +
  geom_point(color = "red") +
  labs(
    title = "Average Distance by Pickup ID",
    x = "Pickup ID",
    y = "Average Distance"
  ) +
  theme_minimal()

# Mean trip distance per dropoff location ID, drawn as a red line with points.
dropoff_data <- taxi_data %>%
  group_by(dropoff_location_id) %>%
  # na.rm = TRUE keeps one missing distance from turning a zone's mean into NA.
  summarize(avg_distance = mean(trip_distance, na.rm = TRUE))
ggplot(dropoff_data, aes(x = dropoff_location_id, y = avg_distance)) +
  # A constant colour is set outside aes(). Inside aes() the string "red" is
  # treated as a data value, which adds a legend and uses the default palette
  # colour instead of red.
  geom_line(color = "red") +
  geom_point(color = "red") +
  labs(
    title = "Average Distance by Dropoff ID",
    x = "Dropoff ID",
    y = "Average Distance"
  ) +
  theme_minimal()

# Number of trips starting at each pickup location ID.
# NOTE(review): `count` is not referenced by this plot; it is kept only in
# case later code reads it -- confirm and remove if unused.
count <- c(250000, 500000, 1000000, 1048575)
ggplot(taxi_data, aes(x = pickup_location_id)) +
  # geom_bar() counts rows per x value and has no `binwidth` parameter, so the
  # argument (which was ignored with a warning) is dropped.
  geom_bar(fill = "yellow", color = "black") +
  # Zoom with coord_cartesian() rather than xlim(): scale limits remove bars
  # whose edges extend past the limits (the two rows reported as removed),
  # whereas a coordinate zoom keeps every bar.
  coord_cartesian(xlim = c(1, 265)) +
  scale_y_continuous(labels = scales::comma) +
  # labs() has to be added to the plot with `+`; as a stand-alone statement it
  # only prints a labels object and the chart keeps its default axis titles.
  labs(title = "Pickup Location Distribution", x = "Pickup ID", y = "Count")
# Number of trips ending at each dropoff location ID.
# NOTE(review): `count` is not referenced by this plot; it is kept only in
# case later code reads it -- confirm and remove if unused.
count <- c(250000, 500000, 1000000, 1048575)
ggplot(taxi_data, aes(x = dropoff_location_id)) +
  # geom_bar() counts rows per x value and has no `binwidth` parameter, so the
  # argument (which was ignored with a warning) is dropped.
  geom_bar(fill = "yellow", color = "black") +
  # Zoom with coord_cartesian() rather than xlim(): scale limits remove bars
  # whose edges extend past the limits (the two rows reported as removed),
  # whereas a coordinate zoom keeps every bar.
  coord_cartesian(xlim = c(1, 265)) +
  scale_y_continuous(labels = scales::comma) +
  # labs() has to be added to the plot with `+`; as a stand-alone statement it
  # only prints a labels object and the chart keeps its default axis titles.
  labs(title = "Dropoff Location Distribution", x = "Dropoff ID", y = "Count")
# Add a `region` column naming the borough of each trip's pickup location ID.
# IDs that appear in none of the lists below get NA. The lookup tables live
# inside local() so they do not leak into the global environment.
taxi_data <- local({
  borough_zones <- list(
    "Bronx" = c(
      3, 18, 20, 31, 32, 46, 47, 51, 58, 59, 69, 78, 81, 94, 119, 126, 136,
      147, 159, 167, 168, 169, 174, 182, 183, 184, 185, 199, 200, 208, 212,
      213, 220, 235, 240, 241, 242, 247, 248, 250, 254, 259
    ),
    "Queens" = c(
      2, 7, 8, 9, 10, 15, 16, 19, 27, 28, 30, 38, 53, 56, 64, 70, 73, 82, 83,
      86, 92, 93, 95, 96, 98, 101, 102, 117, 121, 122, 124, 129, 130, 131,
      132, 134, 135, 138, 139, 145, 146, 157, 160, 171, 173, 175, 179, 180,
      191, 192, 193, 196, 197, 198, 201, 203, 205, 207, 215, 216, 218, 219,
      223, 226, 252, 253, 258, 260
    ),
    "Manhattan" = c(
      4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100,
      103, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143,
      144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194,
      202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239,
      243, 244, 246, 249, 261, 262, 263
    ),
    "Brooklyn" = c(
      11, 14, 17, 21, 22, 25, 26, 29, 33, 34, 35, 36, 37, 39, 40, 49, 52, 54,
      55, 61, 62, 63, 65, 66, 67, 71, 72, 76, 77, 80, 85, 89, 91, 97, 106,
      108, 111, 112, 123, 133, 149, 150, 154, 155, 165, 177, 178, 181, 188,
      189, 190, 195, 210, 217, 222, 225, 227, 228, 255, 256, 257
    ),
    "Staten Island" = c(
      5, 6, 23, 44, 84, 99, 109, 110, 115, 118, 156, 172, 176, 187, 204, 206,
      214, 221, 245, 251
    )
  )
  # Flatten to two parallel vectors: every zone ID and the borough it is
  # listed under. match() returns the first hit, so an ID listed twice would
  # resolve to the earlier borough.
  zone_ids <- unlist(borough_zones, use.names = FALSE)
  zone_borough <- rep(names(borough_zones), times = lengths(borough_zones))
  taxi_data %>%
    mutate(region = zone_borough[match(pickup_location_id, zone_ids)])
})
# Bar chart of the number of trips in each `region`.
ggplot(taxi_data, aes(x = region)) +
  geom_bar(fill = "steelblue") +
  # Namespace-qualified like the other charts in this file, so the label
  # formatter does not depend on the scales package being attached.
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Number of Trips by Region",
    x = "Region",
    y = "Trip Count"
  ) +
  theme_minimal()

# Recode the numeric day_of_week (0-6) as a factor labelled with weekday names.
# NOTE(review): this assumes 0 = Sunday; some tools number days from
# 0 = Monday -- confirm against the data source.
# Running this statement a second time would turn every value into NA, because
# the labels no longer match the levels 0:6.
taxi_data <- mutate(
  taxi_data,
  day_of_week = factor(
    day_of_week,
    levels = 0:6,
    labels = c(
      "Sunday", "Monday", "Tuesday", "Wednesday",
      "Thursday", "Friday", "Saturday"
    )
  )
)
# One row per region: mean trip distance, mean total amount, the most frequent
# weekday and hour of day, and the number of trips.
region_summary <- taxi_data %>%
  group_by(region) %>%
  summarise(
    avg_trip_distance = mean(trip_distance, na.rm = TRUE),
    avg_total_amount = mean(total_amount, na.rm = TRUE),
    # which.max() on a frequency table returns the first entry with the
    # highest count; its name is the modal value.
    most_common_day = names(which.max(table(day_of_week))),
    most_common_hour = as.integer(names(which.max(table(hour_of_day)))),
    trip_count = n()
  )
print(region_summary)
## # A tibble: 6 × 6
## region avg_trip_distance avg_total_amount most_common_day most_common_hour
## <chr> <dbl> <dbl> <chr> <int>
## 1 Bronx 11.1 45.4 Monday 7
## 2 Brooklyn 8.94 37.5 Thursday 8
## 3 Manhattan 7.83 37.5 Wednesday 22
## 4 Queens 11.5 49.1 Sunday 21
## 5 Staten Is… 17.3 81.4 Thursday 13
## 6 <NA> 8.86 47.2 Wednesday 15
## # ℹ 1 more variable: trip_count <int>
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive plotly bar chart of trip counts per region.
region_counts <- taxi_data %>%
  count(region)
region_counts %>%
  # Trips with no region are stored as NA. plotly drops NA categories
  # ("Ignoring 1 observations"), so give them an explicit label for the chart
  # only; `region_counts` itself is left unchanged.
  mutate(region = coalesce(region, "Unknown")) %>%
  plot_ly(
    x = ~region,
    y = ~n,
    type = "bar",
    text = ~paste("Trips:", n),
    hoverinfo = "text",
    marker = list(color = "steelblue")
  ) %>%
  # Qualified because plotly's layout() masks graphics::layout().
  plotly::layout(
    title = "Number of Trips by Region",
    xaxis = list(title = "Region"),
    yaxis = list(title = "Trip Count", tickformat = ",") # comma format numbers
  )
# Heatmap of trip counts for every weekday / hour-of-day combination.
pickup_grid <- count(taxi_data, day_of_week, hour_of_day)
ggplot(
  data = pickup_grid,
  mapping = aes(x = hour_of_day, y = day_of_week, fill = n)
) +
  # White borders separate neighbouring tiles.
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(
    title = "NYC Taxi Pickups by Hour and Day",
    x = "Hour of Day",
    y = "Day of Week",
    fill = "Pickup Count"
  )
